{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import time\n",
    "import random\n",
    "import jsonlines\n",
    "import pandas as pd\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question11_gen(data, num_questions_per_template, questions, question_id):\n",
    "    print(len(data))\n",
    "    new_data = data.loc[data[\"Cancelled\"]==True]\n",
    "    print(len(new_data))\n",
    "    candidate_list = []\n",
    "    for i in range(len(new_data)):\n",
    "        candidate_list.append([new_data.iloc[i][\"FlightDate\"], new_data.iloc[i][\"Origin\"]])\n",
    "    print(len(candidate_list))\n",
    "    flight_date = None\n",
    "    origin = None\n",
    "    row_idx = 0\n",
    "    while question_id < num_questions_per_template:\n",
    "        random_indices = random.randint(0, len(candidate_list)-1)\n",
    "        flightdate = candidate_list[random_indices][0]\n",
    "        origin = candidate_list[random_indices][1]\n",
    "        num_total = len(data.loc[(data[\"FlightDate\"] == flightdate) & (data[\"Origin\"] == origin)])\n",
    "        num_cancelled = len(data.loc[(new_data[\"FlightDate\"] == flightdate) & (data[\"Origin\"] == origin) & (data[\"Cancelled\"] == True)])\n",
    "        if num_cancelled > 0:\n",
    "            question = \"What percentage of the flights from {} were delayed on {}?\".format(origin, flightdate)\n",
    "            answer = \"{:.1f}\".format(num_cancelled / num_total * 100)+\"%\"\n",
    "            questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question12_gen(data, num_questions_per_template, questions, question_id):\n",
    "    origin_list = data[\"Origin\"].unique()\n",
    "    print(len(origin_list))\n",
    "    flightdate_list = data[\"FlightDate\"].unique()\n",
    "    print(len(flightdate_list))\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        origin_idx = random.randint(0, len(origin_list)-1)\n",
    "        flightdate_idx = random.randint(0, len(flightdate_list)-1)\n",
    "        origin = origin_list[origin_idx]\n",
    "        flightdate = flightdate_list[flightdate_idx]\n",
    "        sub_table = data.loc[(data[\"Origin\"] == origin) & (data[\"FlightDate\"] == flightdate)]\n",
    "        total_delay = 0\n",
    "        num_delay = 0\n",
    "        for row_id in range(len(sub_table)):\n",
    "            if not math.isnan(sub_table.iloc[row_id][\"DepDelayMinutes\"]):\n",
    "                num_delay += 1\n",
    "                total_delay += float(sub_table.iloc[row_id][\"DepDelayMinutes\"])\n",
    "        if num_delay != 0:\n",
    "            avg_delay = total_delay / len(sub_table)\n",
    "            question = \"What is the average delay time of all the flights that departed from {} on {}?\".format(origin, flightdate)\n",
    "            answer = \"{:.1f}\".format(avg_delay)\n",
    "            questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question13_gen(data, num_questions_per_template, questions, question_id):\n",
    "    flightdate_list = data[\"FlightDate\"].unique()\n",
    "    # for date in flightdate_list:\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        date_idx = random.randint(0, len(flightdate_list)-1)\n",
    "        date = flightdate_list[date_idx]\n",
    "        num_diverted = len(data.loc[(data[\"FlightDate\"] == date) & (data[\"Diverted\"] == True)])\n",
    "        num_total = len(data.loc[(data[\"FlightDate\"] == date)])\n",
    "        question = \"How many flights were diverted on {}?\".format(date)\n",
    "        answer = str(num_diverted)\n",
    "        questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "        question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question14_gen(data, num_questions_per_template, questions, question_id):\n",
    "    flightdate_list = data[\"FlightDate\"].unique()\n",
    "    # for date in flightdate_list:\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        date_idx = random.randint(0, len(flightdate_list)-1)\n",
    "        date = flightdate_list[date_idx]\n",
    "        num = len(data.loc[(data[\"FlightDate\"] == date) & (data[\"Distance\"] > 300.0)])\n",
    "        question = \"How many flights with a distance greater than 500 miles on {}?\".format(date)\n",
    "        answer = str(num)\n",
    "        questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "        question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question15_gen(data, num_questions_per_template, questions, question_id):\n",
    "    # What is the average airtime of the flights from {origin} to {dest} host by {airline}?\n",
    "    airline_list = data[\"Airline\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        airline_idx = random.randint(0, len(airline_list)-1)\n",
    "        airline = airline_list[airline_idx]\n",
    "        sub_table = data.loc[(data[\"Airline\"] == airline)]\n",
    "\n",
    "        origin_list = sub_table[\"Origin\"].unique()\n",
    "        origin_idx = random.randint(0, len(origin_list)-1)\n",
    "        origin = origin_list[origin_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Origin\"] == origin)]\n",
    "\n",
    "        dest_list = sub_table[\"Dest\"].unique()\n",
    "        dest_idx = random.randint(0, len(dest_list)-1)\n",
    "        dest = dest_list[dest_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Dest\"] == dest)]\n",
    "        if len(sub_table) != 0:\n",
    "            sum_airtime = 0\n",
    "            num_airtime = 0\n",
    "            for row_id in range(len(sub_table)):\n",
    "                if not math.isnan(sub_table.iloc[row_id][\"AirTime\"]):\n",
    "                    sum_airtime += sub_table.iloc[row_id][\"AirTime\"]\n",
    "                    num_airtime += 1\n",
    "            if num_airtime != 0:\n",
    "                avg_airtime = sum_airtime / num_airtime\n",
    "                question = \"What is the average airtime of the flights from {} to {} host by {}?\".format(origin, dest, airline)\n",
    "                answer = \"{:.1f} minutes\".format(avg_airtime)\n",
    "                questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "                question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question16_gen(data, num_questions_per_template, questions, question_id):\n",
    "    airline_list = data[\"Airline\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        airline_idx = random.randint(0, len(airline_list)-1)\n",
    "        airline = airline_list[airline_idx]\n",
    "        sub_table = data.loc[(data[\"Airline\"] == airline)]\n",
    "\n",
    "        origin_list = sub_table[\"Origin\"].unique()\n",
    "        origin_idx = random.randint(0, len(origin_list)-1)\n",
    "        origin = origin_list[origin_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Origin\"] == origin)]\n",
    "\n",
    "        dest_list = sub_table[\"Dest\"].unique()\n",
    "        dest_idx = random.randint(0, len(dest_list)-1)\n",
    "        dest = dest_list[dest_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Dest\"] == dest)]\n",
    "        if len(sub_table) != 0:\n",
    "            num_airlines = len(sub_table[\"Flight_Number_Marketing_Airline\"].unique())\n",
    "            question = \"How many flights from {} to {} host by {}?\".format(origin, dest, airline)\n",
    "            answer = num_airlines\n",
    "            questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question17_gen(data, num_questions_per_template, questions, question_id):\n",
    "    number_list = data[\"Flight_Number_Marketing_Airline\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        number_idx = random.randint(0, len(number_list)-1)\n",
    "        number = number_list[number_idx]\n",
    "        sub_table = data.loc[(data[\"Flight_Number_Marketing_Airline\"] == number)]\n",
    "        carrier_list = sub_table[\"IATA_Code_Marketing_Airline\"].unique()\n",
    "        carrier_idx = random.randint(0, len(carrier_list)-1)\n",
    "        carrier = carrier_list[carrier_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"IATA_Code_Marketing_Airline\"] == carrier)]\n",
    "        if len(sub_table) != 0:\n",
    "            avg_airtime = 0\n",
    "            num_airtime = 0\n",
    "            for row_id in range(len(sub_table)):\n",
    "                if not math.isnan(sub_table.iloc[row_id][\"AirTime\"]):\n",
    "                    avg_airtime += sub_table.iloc[row_id][\"AirTime\"]\n",
    "                    num_airtime += 1\n",
    "            if num_airtime != 0:\n",
    "                avg_airtime = avg_airtime / num_airtime\n",
    "                question = \"What is the average flight time of {}{}?\".format(carrier, number)\n",
    "                answer = \"{:.1f} minutes\".format(avg_airtime)\n",
    "                questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "                question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question18_gen(data, num_questions_per_template, questions, question_id):\n",
    "    # What is the fastest flight from {origin} to {dest} on {DayofWeek}?\n",
    "    origin_list = data[\"Origin\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        origin_idx = random.randint(0, len(origin_list)-1)\n",
    "        origin = origin_list[origin_idx]\n",
    "        sub_table = data.loc[(data[\"Origin\"] == origin)]\n",
    "\n",
    "        dest_list = sub_table[\"Dest\"].unique()\n",
    "        dest_idx = random.randint(0, len(dest_list)-1)\n",
    "        dest = dest_list[dest_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Dest\"] == dest)]\n",
    "\n",
    "        dayofweek_list = sub_table[\"DayOfWeek\"].unique()\n",
    "        dayofweek_idx = random.randint(0, len(dayofweek_list)-1)\n",
    "        dayofweek = dayofweek_list[dayofweek_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"DayOfWeek\"] == dayofweek)]\n",
    "        if len(sub_table) != 0:\n",
    "            min_airtime = 100000\n",
    "            for row_id in range(len(sub_table)):\n",
    "                if sub_table.iloc[row_id][\"AirTime\"] < min_airtime:\n",
    "                    min_airtime = sub_table.iloc[row_id][\"AirTime\"]\n",
    "                    min_carrier = sub_table.iloc[row_id][\"IATA_Code_Marketing_Airline\"]\n",
    "                    min_number = sub_table.iloc[row_id][\"Flight_Number_Marketing_Airline\"]\n",
    "            question = \"What is the fastest flight from {} to {} on {}?\".format(origin, dest, dayofweek)\n",
    "            answer = \"{}{}\".format(min_carrier, min_number)\n",
    "            questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question19_gen(data, num_questions_per_template, questions, question_id):\n",
    "    # What is the average speed of {carrier}{number} from {origin} to {dest}?\n",
    "    origin_list = data[\"Origin\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        origin_idx = random.randint(0, len(origin_list)-1)\n",
    "        origin = origin_list[origin_idx]\n",
    "        sub_table = data.loc[(data[\"Origin\"] == origin)]\n",
    "\n",
    "        dest_list = sub_table[\"Dest\"].unique()\n",
    "        dest_idx = random.randint(0, len(dest_list)-1)\n",
    "        dest = dest_list[dest_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Dest\"] == dest)]\n",
    "\n",
    "        # if len(sub_table) != 0:\n",
    "        number_list = sub_table[\"Flight_Number_Marketing_Airline\"].unique()\n",
    "        number_idx = random.randint(0, len(number_list)-1)\n",
    "        number = number_list[number_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"Flight_Number_Marketing_Airline\"] == number)]\n",
    "        carrier_list = sub_table[\"IATA_Code_Marketing_Airline\"].unique()\n",
    "        carrier_idx = random.randint(0, len(carrier_list)-1)\n",
    "        carrier = carrier_list[carrier_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"IATA_Code_Marketing_Airline\"] == carrier)]\n",
    "        if len(sub_table) != 0:\n",
    "            question = \"What is the average speed of {}{} from {} to {}?\".format(carrier, number, origin, dest)\n",
    "            avg_speed = 0\n",
    "            num_speed = 0\n",
    "            for row_id in range(len(sub_table)):\n",
    "                if not math.isnan(sub_table.iloc[row_id][\"AirTime\"]):\n",
    "                    avg_speed += sub_table.iloc[row_id][\"Distance\"] / sub_table.iloc[row_id][\"AirTime\"]\n",
    "                    num_speed += 1\n",
    "            if num_speed != 0:\n",
    "                avg_speed = avg_speed / num_speed\n",
    "                answer = \"{:.1f} miles per minute\".format(avg_speed)\n",
    "                questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "                question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question20_gen(data, num_questions_per_template, questions, question_id):\n",
    "    # What is the total number of flights operated by {airline} on {date}?\n",
    "    airline_list = data[\"Airline\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        airline_idx = random.randint(0, len(airline_list)-1)\n",
    "        airline = airline_list[airline_idx]\n",
    "        sub_table = data.loc[(data[\"Airline\"] == airline)]\n",
    "\n",
    "        flightdate_list = sub_table[\"FlightDate\"].unique()\n",
    "        flightdate_idx = random.randint(0, len(flightdate_list)-1)\n",
    "        flightdate = flightdate_list[flightdate_idx]\n",
    "        sub_table = sub_table.loc[(sub_table[\"FlightDate\"] == flightdate)]\n",
    "        if len(sub_table) != 0:\n",
    "            question = \"What is the total number of flights operated by {} on {}?\".format(airline, flightdate)\n",
    "            answer = str(len(sub_table))\n",
    "            questions.append({\"qid\": \"hard-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "file_path = \"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/flights/Combined_Flights_2022.csv\"\n",
    "data = pd.read_csv(file_path)\n",
    "num_questions_per_template = 10\n",
    "question_id = 0\n",
    "questions = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4078318\n",
      "123192\n",
      "123192\n",
      "[{'qid': 'hard-flight-0000', 'question': 'What percentage of the flights from PIT were delayed on 2022-05-17?', 'answer': '1.7%'}, {'qid': 'hard-flight-0001', 'question': 'What percentage of the flights from JAC were delayed on 2022-02-24?', 'answer': '11.1%'}, {'qid': 'hard-flight-0002', 'question': 'What percentage of the flights from DEN were delayed on 2022-02-02?', 'answer': '15.0%'}, {'qid': 'hard-flight-0003', 'question': 'What percentage of the flights from HPN were delayed on 2022-01-30?', 'answer': '37.1%'}, {'qid': 'hard-flight-0004', 'question': 'What percentage of the flights from STT were delayed on 2022-03-19?', 'answer': '4.0%'}, {'qid': 'hard-flight-0005', 'question': 'What percentage of the flights from ORD were delayed on 2022-06-13?', 'answer': '6.9%'}, {'qid': 'hard-flight-0006', 'question': 'What percentage of the flights from LAW were delayed on 2022-04-24?', 'answer': '50.0%'}, {'qid': 'hard-flight-0007', 'question': 'What percentage of the flights from PIT were delayed on 2022-01-28?', 'answer': '9.6%'}, {'qid': 'hard-flight-0008', 'question': 'What percentage of the flights from MDW were delayed on 2022-01-04?', 'answer': '13.8%'}, {'qid': 'hard-flight-0009', 'question': 'What percentage of the flights from CLE were delayed on 2022-04-09?', 'answer': '4.8%'}]\n",
      "Time elapsed for Question 11: 640.6908073425293 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 11\n",
    "questions, question_id = question11_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 11: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "375\n",
      "212\n",
      "[{'qid': 'hard-flight-0010', 'question': 'What is the average delay time of all the flights that departed from ERI on 2022-01-18?', 'answer': '23.5'}, {'qid': 'hard-flight-0011', 'question': 'What is the average delay time of all the flights that departed from PBG on 2022-07-31?', 'answer': '0.0'}, {'qid': 'hard-flight-0012', 'question': 'What is the average delay time of all the flights that departed from TBN on 2022-05-23?', 'answer': '0.0'}, {'qid': 'hard-flight-0013', 'question': 'What is the average delay time of all the flights that departed from SUX on 2022-07-12?', 'answer': '0.0'}, {'qid': 'hard-flight-0014', 'question': 'What is the average delay time of all the flights that departed from BRW on 2022-03-09?', 'answer': '19.0'}, {'qid': 'hard-flight-0015', 'question': 'What is the average delay time of all the flights that departed from PSG on 2022-02-06?', 'answer': '0.0'}, {'qid': 'hard-flight-0016', 'question': 'What is the average delay time of all the flights that departed from SCK on 2022-06-03?', 'answer': '50.5'}, {'qid': 'hard-flight-0017', 'question': 'What is the average delay time of all the flights that departed from PSG on 2022-07-27?', 'answer': '0.0'}, {'qid': 'hard-flight-0018', 'question': 'What is the average delay time of all the flights that departed from FAT on 2022-06-10?', 'answer': '25.2'}, {'qid': 'hard-flight-0019', 'question': 'What is the average delay time of all the flights that departed from RDU on 2022-04-02?', 'answer': '22.9'}]\n",
      "Time elapsed for Question 12: 999.0029759407043 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 12\n",
    "questions, question_id = question12_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 12: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0020', 'question': 'How many flights were diverted on 2022-01-14?', 'answer': '34'}, {'qid': 'hard-flight-0021', 'question': 'How many flights were diverted on 2022-01-05?', 'answer': '30'}, {'qid': 'hard-flight-0022', 'question': 'How many flights were diverted on 2022-05-11?', 'answer': '33'}, {'qid': 'hard-flight-0023', 'question': 'How many flights were diverted on 2022-04-23?', 'answer': '31'}, {'qid': 'hard-flight-0024', 'question': 'How many flights were diverted on 2022-01-19?', 'answer': '54'}, {'qid': 'hard-flight-0025', 'question': 'How many flights were diverted on 2022-04-18?', 'answer': '26'}, {'qid': 'hard-flight-0026', 'question': 'How many flights were diverted on 2022-03-01?', 'answer': '16'}, {'qid': 'hard-flight-0027', 'question': 'How many flights were diverted on 2022-04-02?', 'answer': '41'}, {'qid': 'hard-flight-0028', 'question': 'How many flights were diverted on 2022-07-10?', 'answer': '69'}, {'qid': 'hard-flight-0029', 'question': 'How many flights were diverted on 2022-01-27?', 'answer': '44'}]\n",
      "Time elapsed for Question 13: 1005.2014305591583 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 13\n",
    "questions, question_id = question13_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 13: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0030', 'question': 'How many flights with a distance greater than 500 miles on 2022-01-02?', 'answer': '16588'}, {'qid': 'hard-flight-0031', 'question': 'How many flights with a distance greater than 500 miles on 2022-01-09?', 'answer': '15583'}, {'qid': 'hard-flight-0032', 'question': 'How many flights with a distance greater than 500 miles on 2022-03-18?', 'answer': '16556'}, {'qid': 'hard-flight-0033', 'question': 'How many flights with a distance greater than 500 miles on 2022-02-25?', 'answer': '16354'}, {'qid': 'hard-flight-0034', 'question': 'How many flights with a distance greater than 500 miles on 2022-04-15?', 'answer': '16515'}, {'qid': 'hard-flight-0035', 'question': 'How many flights with a distance greater than 500 miles on 2022-03-15?', 'answer': '14833'}, {'qid': 'hard-flight-0036', 'question': 'How many flights with a distance greater than 500 miles on 2022-01-03?', 'answer': '16685'}, {'qid': 'hard-flight-0037', 'question': 'How many flights with a distance greater than 500 miles on 2022-01-09?', 'answer': '15583'}, {'qid': 'hard-flight-0038', 'question': 'How many flights with a distance greater than 500 miles on 2022-05-21?', 'answer': '14135'}, {'qid': 'hard-flight-0039', 'question': 'How many flights with a distance greater than 500 miles on 2022-07-02?', 'answer': '15166'}]\n",
      "Time elapsed for Question 14: 1007.5210545063019 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 14\n",
    "questions, question_id = question14_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 14: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0040', 'question': 'What is the average airtime of the flights from BZN to CLT host by American Airlines Inc.?', 'answer': '198.2 minutes'}, {'qid': 'hard-flight-0041', 'question': 'What is the average airtime of the flights from MKE to CLT host by Comair Inc.?', 'answer': '95.2 minutes'}, {'qid': 'hard-flight-0042', 'question': 'What is the average airtime of the flights from ITO to OGG host by Hawaiian Airlines Inc.?', 'answer': '24.9 minutes'}, {'qid': 'hard-flight-0043', 'question': 'What is the average airtime of the flights from CLT to MOB host by Capital Cargo International?', 'answer': '85.6 minutes'}, {'qid': 'hard-flight-0044', 'question': 'What is the average airtime of the flights from CHO to CLT host by Capital Cargo International?', 'answer': '48.3 minutes'}, {'qid': 'hard-flight-0045', 'question': 'What is the average airtime of the flights from AMA to DFW host by Mesa Airlines Inc.?', 'answer': '49.3 minutes'}, {'qid': 'hard-flight-0046', 'question': 'What is the average airtime of the flights from SMF to OGG host by Hawaiian Airlines Inc.?', 'answer': '307.7 minutes'}, {'qid': 'hard-flight-0047', 'question': 'What is the average airtime of the flights from ORF to ORD host by Envoy Air?', 'answer': '114.2 minutes'}, {'qid': 'hard-flight-0048', 'question': 'What is the average airtime of the flights from MSP to MYR host by Spirit Air Lines?', 'answer': '130.5 minutes'}, {'qid': 'hard-flight-0049', 'question': 'What is the average airtime of the flights from DLH to ORD host by Air Wisconsin Airlines Corp?', 'answer': '72.0 minutes'}]\n",
      "Time elapsed for Question 15: 1010.1299157142639 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 15\n",
    "questions, question_id = question15_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 15: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0050', 'question': 'How many flights from SLC to EWR host by United Air Lines Inc.?', 'answer': 7}, {'qid': 'hard-flight-0051', 'question': 'How many flights from JFK to IND host by Endeavor Air Inc.?', 'answer': 5}, {'qid': 'hard-flight-0052', 'question': 'How many flights from MSO to DEN host by Frontier Airlines Inc.?', 'answer': 1}, {'qid': 'hard-flight-0053', 'question': 'How many flights from ATL to CLE host by Spirit Air Lines?', 'answer': 1}, {'qid': 'hard-flight-0054', 'question': 'How many flights from ATW to ORD host by Air Wisconsin Airlines Corp?', 'answer': 13}, {'qid': 'hard-flight-0055', 'question': 'How many flights from FNT to ORD host by GoJet Airlines, LLC d/b/a United Express?', 'answer': 3}, {'qid': 'hard-flight-0056', 'question': 'How many flights from AVL to CLT host by Comair Inc.?', 'answer': 19}, {'qid': 'hard-flight-0057', 'question': 'How many flights from CLE to ORD host by Air Wisconsin Airlines Corp?', 'answer': 3}, {'qid': 'hard-flight-0058', 'question': 'How many flights from ITO to LAX host by United Air Lines Inc.?', 'answer': 2}, {'qid': 'hard-flight-0059', 'question': 'How many flights from DSM to CLT host by American Airlines Inc.?', 'answer': 6}]\n",
      "Time elapsed for Question 16: 1012.4740109443665 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 16\n",
    "questions, question_id = question16_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 16: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0060', 'question': 'What is the average flight time of G41015?', 'answer': '192.5 minutes'}, {'qid': 'hard-flight-0061', 'question': 'What is the average flight time of NK2062?', 'answer': '119.1 minutes'}, {'qid': 'hard-flight-0062', 'question': 'What is the average flight time of AA5297?', 'answer': '73.0 minutes'}, {'qid': 'hard-flight-0063', 'question': 'What is the average flight time of G4886?', 'answer': '144.0 minutes'}, {'qid': 'hard-flight-0064', 'question': 'What is the average flight time of G41931?', 'answer': '98.6 minutes'}, {'qid': 'hard-flight-0065', 'question': 'What is the average flight time of AA3262?', 'answer': '46.7 minutes'}, {'qid': 'hard-flight-0066', 'question': 'What is the average flight time of WN6267?', 'answer': '136.2 minutes'}, {'qid': 'hard-flight-0067', 'question': 'What is the average flight time of WN6679?', 'answer': '147.0 minutes'}, {'qid': 'hard-flight-0068', 'question': 'What is the average flight time of DL1271?', 'answer': '84.4 minutes'}, {'qid': 'hard-flight-0069', 'question': 'What is the average flight time of DL4939?', 'answer': '70.6 minutes'}]\n",
      "Time elapsed for Question 17: 1012.6966557502747 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 17\n",
    "questions, question_id = question17_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 17: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0070', 'question': 'What is the fastest flight from BFL to DEN on 6?', 'answer': 'UA5839'}, {'qid': 'hard-flight-0071', 'question': 'What is the fastest flight from DVL to JMS on 2?', 'answer': 'UA5045'}, {'qid': 'hard-flight-0072', 'question': 'What is the fastest flight from SLC to BIL on 1?', 'answer': 'DL2178'}, {'qid': 'hard-flight-0073', 'question': 'What is the fastest flight from PIA to FLL on 4?', 'answer': 'G4899'}, {'qid': 'hard-flight-0074', 'question': 'What is the fastest flight from AVL to PIE on 6?', 'answer': 'G4504'}, {'qid': 'hard-flight-0075', 'question': 'What is the fastest flight from BIH to SFO on 3?', 'answer': 'UA4639'}, {'qid': 'hard-flight-0076', 'question': 'What is the fastest flight from ERI to CLT on 4?', 'answer': 'AA6096'}, {'qid': 'hard-flight-0077', 'question': 'What is the fastest flight from AKN to ANC on 3?', 'answer': 'AS2134'}, {'qid': 'hard-flight-0078', 'question': 'What is the fastest flight from DTW to CWA on 4?', 'answer': 'DL5409'}, {'qid': 'hard-flight-0079', 'question': 'What is the fastest flight from SRQ to CLE on 1?', 'answer': 'F92430'}]\n",
      "Time elapsed for Question 18: 1137.4875254631042 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 18\n",
    "questions, question_id = question18_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 18: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0080', 'question': 'What is the average speed of DL4130 from HPN to DTW?', 'answer': '6.1 miles per minute'}, {'qid': 'hard-flight-0081', 'question': 'What is the average speed of WN1507 from COS to MDW?', 'answer': '7.9 miles per minute'}, {'qid': 'hard-flight-0082', 'question': 'What is the average speed of AA5272 from GPT to CLT?', 'answer': '7.4 miles per minute'}, {'qid': 'hard-flight-0083', 'question': 'What is the average speed of AA4086 from BPT to DFW?', 'answer': '5.3 miles per minute'}, {'qid': 'hard-flight-0084', 'question': 'What is the average speed of UA6124 from HSV to ORD?', 'answer': '5.7 miles per minute'}, {'qid': 'hard-flight-0085', 'question': 'What is the average speed of DL4239 from LSE to MSP?', 'answer': '4.1 miles per minute'}, {'qid': 'hard-flight-0086', 'question': 'What is the average speed of AA5165 from TRI to CLT?', 'answer': '4.0 miles per minute'}, {'qid': 'hard-flight-0087', 'question': 'What is the average speed of DL2993 from GRB to ATL?', 'answer': '7.3 miles per minute'}, {'qid': 'hard-flight-0088', 'question': 'What is the average speed of DL4268 from IMT to MSP?', 'answer': '5.3 miles per minute'}, {'qid': 'hard-flight-0089', 'question': 'What is the average speed of UA5104 from JMS to DVL?', 'answer': '3.8 miles per minute'}]\n",
      "Time elapsed for Question 19: 1143.2772855758667 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 19\n",
    "questions, question_id = question19_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 19: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-flight-0090', 'question': 'What is the total number of flights operated by GoJet Airlines, LLC d/b/a United Express on 2022-02-01?', 'answer': '177'}, {'qid': 'hard-flight-0091', 'question': 'What is the total number of flights operated by United Air Lines Inc. on 2022-05-27?', 'answer': '1765'}, {'qid': 'hard-flight-0092', 'question': 'What is the total number of flights operated by Commutair Aka Champlain Enterprises, Inc. on 2022-03-24?', 'answer': '195'}, {'qid': 'hard-flight-0093', 'question': 'What is the total number of flights operated by Hawaiian Airlines Inc. on 2022-01-11?', 'answer': '190'}, {'qid': 'hard-flight-0094', 'question': 'What is the total number of flights operated by Hawaiian Airlines Inc. on 2022-06-04?', 'answer': '201'}, {'qid': 'hard-flight-0095', 'question': 'What is the total number of flights operated by Allegiant Air on 2022-04-12?', 'answer': '90'}, {'qid': 'hard-flight-0096', 'question': 'What is the total number of flights operated by Frontier Airlines Inc. on 2022-04-10?', 'answer': '453'}, {'qid': 'hard-flight-0097', 'question': 'What is the total number of flights operated by American Airlines Inc. on 2022-06-08?', 'answer': '2532'}, {'qid': 'hard-flight-0098', 'question': 'What is the total number of flights operated by Alaska Airlines Inc. on 2022-03-02?', 'answer': '567'}, {'qid': 'hard-flight-0099', 'question': 'What is the total number of flights operated by Commutair Aka Champlain Enterprises, Inc. on 2022-02-18?', 'answer': '226'}]\n",
      "Time elapsed for Question 20: 1147.307672739029 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 20\n",
    "questions, question_id = question20_gen(data, num_questions_per_template, questions, question_id)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 20: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/flight-hard.jsonl', mode='w') as writer:\n",
    "    for row in questions:\n",
    "        writer.write(row)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
